Dans ce notebook, je propose d'appliquer les algorithmes K-means et DBSCAN pour effectuer notre segmentation. Nous pourrons ainsi comparer nos résultats et choisir la solution la mieux adaptée à notre problématique.
from datetime import datetime, timedelta
from kmodes.kprototypes import KPrototypes
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import silhouette_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn import metrics
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer, InterclusterDistance
import importlib.metadata
import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import os
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go
import re
import seaborn as sns
# Environment report (the '!' line is an IPython shell escape, notebook-only)
!python --version
# Print "name version" for every imported module exposing __version__
print('\n'.join(f'{m.__name__} {m.__version__}'
                for m in globals().values()
                if getattr(m, '__version__', None)))
plt.rcParams.update({'font.size': 18})  # larger default font for all plots
Python 3.9.7 missingno 0.4.1 numpy 1.23.5 pandas 1.3.4 re 2.2.1 seaborn 0.12.2
def boxplotdistandlog(variable, a, b, titre):
    """Plot a variable's distribution before and after a log(x+1) transform.

    Left column: boxplot + histogram of the raw variable.
    Right column: boxplot + histogram of log(variable + 1).
    Also prints the skewness coefficient before and after the transform.

    Parameters
    ----------
    variable : pandas.Series
        Numeric variable to inspect (must support .skew()).
    a, b : numeric
        Figure width and height in inches.
    titre : str
        Axis label for the variable.
    """
    f, ax = plt.subplots(2, 2, figsize=(a, b),
                         gridspec_kw={"height_ratios": (.15, .85)})
    # Raw distribution (left column)
    sns.boxplot(variable, ax=ax[0, 0], orient="h")
    ax[0, 0].set(xlabel='')
    ax[0, 0].set_title("")
    sns.histplot(variable, ax=ax[1, 0])
    ax[1, 0].set(xlabel=titre)
    # log(x + 1) keeps zero values defined
    variable_log = np.log(variable + 1)
    sns.boxplot(variable_log, ax=ax[0, 1], orient="h")
    ax[0, 1].set(xlabel='')
    ax[0, 1].set_title("")
    sns.histplot(variable_log, ax=ax[1, 1])
    ax[1, 1].set(xlabel='log(' + titre + '+1)')
    plt.show()
    print("coefficient d'asymétrie:", variable.skew())
    # FIX: reuse the transform already computed above instead of
    # recomputing np.log(variable + 1) a second time
    print("coefficient d'asymétrie après transformation:", variable_log.skew())
def boxplotdistandlog2(variable, a, b, titre):
    """Plot a variable's distribution before and after a plain log transform.

    Same layout as boxplotdistandlog, but uses log(x) (no +1), so the
    variable must be strictly positive.

    Parameters
    ----------
    variable : pandas.Series
        Strictly positive numeric variable (must support .skew()).
    a, b : numeric
        Figure width and height in inches.
    titre : str
        Axis label for the variable.
    """
    f, ax = plt.subplots(2, 2, figsize=(a, b),
                         gridspec_kw={"height_ratios": (.15, .85)})
    # Raw distribution (left column)
    sns.boxplot(variable, ax=ax[0, 0], orient="h")
    ax[0, 0].set(xlabel='')
    ax[0, 0].set_title("")
    sns.histplot(variable, ax=ax[1, 0])
    ax[1, 0].set(xlabel=titre)
    variable_log = np.log(variable)
    sns.boxplot(variable_log, ax=ax[0, 1], orient="h")
    ax[0, 1].set(xlabel='')
    ax[0, 1].set_title("")
    sns.histplot(variable_log, ax=ax[1, 1])
    ax[1, 1].set(xlabel='log(' + titre + ')')
    plt.show()
    print("coefficient d'asymétrie:", variable.skew())
    # BUG FIX: the original reported the skewness of np.log(variable + 1)
    # here, although the transform actually plotted is np.log(variable).
    # Report the skewness of the same transform that is displayed.
    print("coefficient d'asymétrie après transformation:", variable_log.skew())
# Load the pre-built synthetic dataset (one row per order line)
df_synt=pd.read_csv('df_synt.csv')
# Convert the date columns to datetime
col_time=['order_purchase_timestamp','order_approved_at',
          'order_delivered_carrier_date','order_delivered_customer_date',
          'order_estimated_delivery_date']
for date in col_time:
    # Format inferred by pandas; the explicit format/errors options were disabled
    df_synt[date]= pd.to_datetime(df_synt[date])#,format='%Y-%m-%d %H:%M:%S', errors='coerce')
df_synt.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 115093 entries, 0 to 115092 Data columns (total 42 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 order_id 115093 non-null object 1 customer_id 115093 non-null object 2 order_purchase_timestamp 115093 non-null datetime64[ns] 3 order_approved_at 115093 non-null datetime64[ns] 4 order_delivered_carrier_date 115093 non-null datetime64[ns] 5 order_delivered_customer_date 115093 non-null datetime64[ns] 6 order_estimated_delivery_date 115093 non-null datetime64[ns] 7 customer_unique_id 115093 non-null object 8 customer_zip_code_prefix 115093 non-null int64 9 review_score 114236 non-null float64 10 review_comment_title 13559 non-null object 11 review_comment_message 47730 non-null object 12 review_creation_date 114236 non-null object 13 review_answer_timestamp 114236 non-null object 14 Satisfaction 114236 non-null object 15 Satisfaction_num 115093 non-null float64 16 order_item_id 115093 non-null int64 17 product_id 115093 non-null object 18 seller_id 115093 non-null object 19 shipping_limit_date 115093 non-null object 20 price 115093 non-null float64 21 freight_value 115093 non-null float64 22 product_category_name 113459 non-null object 23 payment_sequential 115093 non-null float64 24 payment_type 115093 non-null object 25 payment_installments 115093 non-null float64 26 payment_value 115093 non-null float64 27 nb_payment_sequential 115093 non-null float64 28 sum_payment_installments 115093 non-null float64 29 product_category_name_english 113436 non-null object 30 distance_client_vendeur 114791 non-null float64 31 product_vol_cm3 115073 non-null float64 32 recence 115093 non-null int64 33 delta_order_approved_at 115093 non-null int64 34 delta_order_delivered_carrier_date 115093 non-null int64 35 delta_order_delivered_customer_date 115093 non-null int64 36 differentiel_date_prevue 115093 non-null int64 37 delta_seller_to_carrier_status 115093 non-null int64 38 product_category 113436 non-null 
object 39 ponctualité_livraison_vendeur_relais 115093 non-null object 40 ponctualité_livraison_client 115093 non-null object 41 sale_month 115093 non-null int64 dtypes: datetime64[ns](5), float64(11), int64(9), object(17) memory usage: 36.9+ MB
# RFM table: one row per customer — recency (min days), frequency
# (number of distinct orders) and monetary value (total payments)
rfm= df_synt.groupby('customer_unique_id').agg({'recence': 'min', 'order_id': 'nunique', 'payment_value' : 'sum'})
rfm.rename(columns={'order_id':'frequence','payment_value' : 'montant'}, inplace=True)
rfm
| recence | frequence | montant | |
|---|---|---|---|
| customer_unique_id | |||
| 0000366f3b9a7992bf8c76cfdf3221e2 | 111 | 1 | 141.90 |
| 0000b849f77a49e4a4ce2b2a4ca5be3f | 114 | 1 | 27.19 |
| 0000f46a3911fa3c0805444483337064 | 537 | 1 | 86.22 |
| 0000f6ccb0745a6a4b88665a16c9f078 | 321 | 1 | 43.62 |
| 0004aac84e0df4da2b147fca70cf8255 | 288 | 1 | 196.89 |
| ... | ... | ... | ... |
| fffcf5a5ff07b0908bd4e2dbc735a684 | 447 | 1 | 4134.84 |
| fffea47cd6d3cc0a88bd621562a9d061 | 262 | 1 | 84.58 |
| ffff371b4d645b6ecea244b27531430a | 568 | 1 | 112.46 |
| ffff5962728ec6157033ef9805bacc48 | 119 | 1 | 133.69 |
| ffffd2657e2aad2907e67c3e9daecbeb | 484 | 1 | 71.56 |
92881 rows × 3 columns
boxplotdistandlog(rfm['montant'],10,7,'Montant')
coefficient d'asymétrie: 30.66234641308994 coefficient d'asymétrie après transformation: 0.7477694963766197
boxplotdistandlog(rfm['frequence'],10,7,'Frequence')
coefficient d'asymétrie: 11.142163940501918 coefficient d'asymétrie après transformation: 6.531905420071655
boxplotdistandlog(rfm['recence'],10,7,'Recence')
coefficient d'asymétrie: 0.4143281762296858 coefficient d'asymétrie après transformation: -1.256955393139069
# Outlier handling: for 'recence' (column 0) and 'montant' (column 2), drop
# the rows falling outside the [5th, 95th] percentile range. As in the
# original, the second pass recomputes its percentiles on the already
# filtered frame.
for col_idx in [0, 2]:
    col = rfm.columns[col_idx]
    low = np.percentile(rfm[col], 5)
    high = np.percentile(rfm[col], 95)
    out_of_range = (rfm[col] < low) | (rfm[col] > high)
    rfm.drop(rfm[out_of_range].index, inplace=True)
rfm.reset_index(inplace=True, drop=True)
# Log transform of the amount to reduce its strong right skew
rfm['montant'] = np.log(rfm['montant'] + 1)
X = rfm
# Preprocessing + K-means pipeline on the RFM features.
# FIX: the original fitted the pipeline twice (fit, then fit_predict) and
# evaluated a predict() whose result was discarded. With the fixed
# random_state the second fit reproduces the first, so one fit_predict
# yields the same model and the same labels.
model = make_pipeline(RobustScaler(), KMeans(n_clusters=4, n_init=10, random_state=42))
y_clusters = model.fit_predict(X)
# Negative inertia of the fitted KMeans (displayed by the notebook)
model.score(X)
-25716.940901949325
model.set_params
<bound method Pipeline.set_params of Pipeline(steps=[('robustscaler', RobustScaler()),
('kmeans', KMeans(n_clusters=4, n_init=10, random_state=42))])>
model.named_steps['kmeans'].cluster_centers_
array([[-0.40737255, 0.06174484, 0.65007034],
[ 0.7345856 , 0.00829372, -0.47057776],
[-0.3847552 , 0.00765568, -0.47982338],
[ 0.69748789, 0.0497503 , 0.66858419]])
def nuage_points_2d(abscisse, ordonnée, clusters):
    """Draw a 2-D scatter plot of the observations coloured by cluster label.

    abscisse / ordonnée are the x and y series; clusters holds one label
    per observation and drives the colour mapping.
    """
    plt.figure(figsize=[15, 8])
    # Fixed palette so cluster colours stay stable across plots
    cluster_palette = ['purple', 'green', 'red', 'blue', 'orange', 'royalblue', 'yellow']
    sns.scatterplot(x=abscisse, y=ordonnée, hue=clusters, palette=cluster_palette)
    plt.legend(prop={'size': 10})
    sns.despine()
def nuage_points_3d(dataframe, y_clust, figsizea, figsizeb, xlabel, ylabel, zlabel, zenit, azimut):
    """Draw a 3-D scatter plot of the first three columns of *dataframe*.

    Points are coloured by the cluster labels in y_clust; zenit and azimut
    set the initial camera elevation and azimuth of the 3-D view.
    """
    fig = plt.figure(1, figsize=(figsizea, figsizeb))
    axis3d = fig.add_subplot(111, projection="3d", elev=zenit, azim=azimut)
    # The first three dataframe columns provide the x, y, z coordinates
    xs = dataframe.iloc[:, 0]
    ys = dataframe.iloc[:, 1]
    zs = dataframe.iloc[:, 2]
    axis3d.scatter(xs, ys, zs, c=y_clust, cmap="Set1", edgecolor='face', s=40)
    # Axis captions
    axis3d.set_xlabel(xlabel)
    axis3d.set_ylabel(ylabel)
    axis3d.set_zlabel(zlabel)
nuage_points_3d(X, y_clusters, 10, 10, "recence", "frequence", "montant", -150, 50)
def clustering_eval(preprocessor, model, data, metric, elbow=True, mds=False, KBest=None):
    """Run yellowbrick diagnostics for a clustering model and return the K used.

    Stacks up to three subplots: an elbow curve (optional), a silhouette
    plot, and an intercluster distance map (optional).

    Parameters
    ----------
    preprocessor : sklearn transformer applied before the clustering model.
    model : clustering estimator exposing n_clusters (e.g. KMeans).
    data : feature matrix to cluster.
    metric : elbow metric name ('distortion', 'calinski_harabasz', ...).
    elbow : if True, run KElbowVisualizer and take its elbow as K.
    mds : if True, add an InterclusterDistance map.
    KBest : number of clusters to use when elbow=False.

    Returns
    -------
    The number of clusters K actually used for the silhouette plot.
    """
    # Number of stacked subplots depends on which diagnostics are requested.
    # NOTE(review): with elbow=True, mds=False this yields nrows=2 via the
    # elif, so the final 'else' (nrows=1) is unreachable for any input;
    # axes[...] indexing below would also fail if nrows were ever 1.
    if((elbow==True) & (mds==True)):
        nrows=3
    elif((elbow==False) | (mds==False)):
        nrows=2
    else:
        nrows=1
    fig, axes = plt.subplots(nrows=nrows, ncols=1, sharex=False, sharey=False, figsize=(10,20))
    ax=0  # index of the next free subplot
    if(elbow==True):
        # Elbow visualizer: fit the model over a range of K and detect the elbow.
        # NOTE(review): KElbowVisualizer's range parameter is lowercase 'k';
        # this capital 'K=(1,12)' is presumably ignored — verify against the
        # yellowbrick API.
        kmeans_visualizer = Pipeline([
            ("preprocessor", preprocessor),
            ("kelbowvisualizer", KElbowVisualizer(model,K=(1,12), metric=metric, ax=axes[ax]))])
        kmeans_visualizer.fit(data)
        KBest = kmeans_visualizer.named_steps['kelbowvisualizer'].elbow_value_
        kmeans_visualizer.named_steps['kelbowvisualizer'].finalize()
        ax+=1
    # Set best K
    # NOTE(review): if elbow=False and KBest is None, set_params(n_clusters=None)
    # will make the silhouette step fail — KBest is effectively required then.
    K = KBest
    model.set_params(n_clusters=K)
    # Silhouette Visualizer with the chosen K
    silhouette_visualizer = Pipeline([
        ("preprocessor", preprocessor),
        ("silhouettevisualizer", SilhouetteVisualizer(model, ax=axes[ax]))])
    silhouette_visualizer.fit(data)
    silhouette_visualizer.named_steps['silhouettevisualizer'].finalize()
    ax+=1
    # Intercluster distance Map with best k
    if(mds==True):
        distance_visualizer = Pipeline([
            ("preprocessor", preprocessor),
            ("distancevisualizer", InterclusterDistance(model, ax=axes[ax]))])
        distance_visualizer.fit(data)
        distance_visualizer.named_steps['distancevisualizer'].finalize()
    return K
plt.show()
# Choose K on the RFM features via the distortion elbow (silhouette included)
K=clustering_eval(preprocessor=RobustScaler(),
                  model=KMeans(n_init=10,random_state=42),
                  data=X,
                  metric="distortion",
                  elbow=True,
                  mds=False,
                  KBest=None)
# Calinski-Harabasz elbow curve as a second opinion on K
calinski_visualizer = make_pipeline(RobustScaler(), KElbowVisualizer(KMeans(n_init=10,random_state=42),
                                                                     k=(2,12),
                                                                     metric='calinski_harabasz',
                                                                     timings=False ))
calinski_visualizer.fit(X) # Fit the data to the visualizer
calinski_visualizer.named_steps['kelbowvisualizer'].show()
<Axes: title={'center': 'Calinski Harabasz Score Elbow for KMeans Clustering'}, xlabel='k', ylabel='calinski harabasz score'>
def plot_radars(df, group):
    """Radar chart comparing the per-cluster mean of each variable.

    df holds one row per cluster (index = cluster label, columns = variables);
    group is the name the label column takes once the index is reset. All
    variables are min-max scaled to [0, 1] so they share one radial axis.
    """
    # Scale, then bring the cluster label back as a regular first column
    df = pd.DataFrame(MinMaxScaler().fit_transform(df),
                      index=df.index,
                      columns=df.columns).reset_index()
    fig = go.Figure()
    variable_names = df.columns[1:]
    # One polygon per cluster row
    for cluster_id in df[group]:
        row_values = df.loc[df[group] == cluster_id].iloc[:, 1:].values.reshape(-1)
        fig.add_trace(go.Scatterpolar(
            r=row_values,
            theta=variable_names,
            fill='toself',
            name='Cluster ' + str(cluster_id)
        ))
    fig.update_layout(
        polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
        showlegend=True,
        title={
            'text': "Comparaison des moyennes par variable des clusters",
            'y': 0.95,
            'x': 0.5,
            'xanchor': 'center',
            'yanchor': 'top'},
        title_font_color="blue",
        title_font_size=18)
    fig.show()
# KMeans Pipeline with best K
kmeans_model_prime = make_pipeline(RobustScaler(), KMeans(n_clusters=K,n_init=10,random_state=42))
# NOTE(review): fit() followed by fit_predict() fits the model twice; with the
# fixed random_state the result is identical, but one call would suffice.
kmeans_model_prime.fit(X)
y_clusters = kmeans_model_prime.fit_predict(X)
# Kmeans labels
kmeans_labels_prime = kmeans_model_prime.named_steps['kmeans'].labels_
# Scale X with the same scaler type as the pipeline so cluster means share a scale
scaler = RobustScaler()
X_scaled_prime = scaler.fit_transform(X)
X_scaled_prime = pd.DataFrame(X_scaled_prime, index=X.index, columns=X.columns)
X_scaled_prime["kmeans_label"] = kmeans_labels_prime
# Group by cluster
X_scaled_clusters_prime = X_scaled_prime.groupby("kmeans_label").mean()
# Plot Radar chart
plot_radars(X_scaled_clusters_prime, "kmeans_label")
def distri_clusters(df, variable):
    """Bar plot of cluster sizes, annotating each bar with its count.

    Generalized from the original hard-coded 4-cluster version: works for
    any number of integer cluster labels present in df[variable].

    Parameters
    ----------
    df : pandas.DataFrame
        One row per observation (customer).
    variable : str
        Name of the column holding each row's cluster label.
    """
    sns.countplot(data=df, x=variable, palette='viridis_r')
    plt.title('Cluster Sizes', fontsize=15)
    plt.xlabel('Clusters', fontsize=14)
    plt.ylabel('No. of Customers', fontsize=14)
    counts = df[variable].value_counts()
    labels = sorted(counts.index)
    plt.xticks(range(len(labels)), ['Cluster ' + str(lab) for lab in labels], rotation=0)
    # Annotate each bar slightly above its top with the cluster size
    for pos, lab in enumerate(labels):
        plt.text(x=pos - 0.2, y=counts[lab] + 500, s=counts[lab])
    plt.tight_layout(pad=-1)
    plt.show()
distri_clusters(X_scaled_prime,'kmeans_label')
En observant les courbes du distortion score et du silhouette score, on en déduit que la segmentation K-means de notre dataset, réduit aux variables RFM, est optimale avec 4 clusters. Essayons un autre algorithme de clustering.
# Extract a sample (frac=1 shuffles the whole frame, then the first
# 10000 rows are kept)
df_sample = rfm.sample(frac= 1, random_state= 42)[:10000]
# Scale the sample for DBSCAN (RobustScaler: median / IQR based)
X1=df_sample
sc = RobustScaler()
X_scaled = sc.fit_transform(X1)
X_scaled=pd.DataFrame(X_scaled,columns=['recence','frequence','montant'])
X_scaled.head()
| recence | frequence | montant | |
|---|---|---|---|
| 0 | 0.556522 | 0.0 | -0.070283 |
| 1 | -0.521739 | 0.0 | -0.101678 |
| 2 | -0.065217 | 0.0 | 0.287219 |
| 3 | -0.560870 | 0.0 | 1.043063 |
| 4 | -0.778261 | 0.0 | -0.942408 |
# Search for the best epsilon parameter: sorted distance of each point to
# its nearest neighbour; the "knee" of this curve suggests eps for DBSCAN
neigh = NearestNeighbors(n_neighbors=2)
nbrs = neigh.fit(X_scaled)
distances, indices = nbrs.kneighbors(X_scaled)
distances = np.sort(distances, axis=0)
# Column 0 is each point's distance to itself (0); keep column 1
distances = distances[:,1]
plt.plot(distances)
[<matplotlib.lines.Line2D at 0x1e29fad6c70>]
# DBSCAN with eps read off the k-distance curve above
y_clusters_db = DBSCAN(eps = 0.05, min_samples=38).fit_predict(X_scaled)
nuage_points_3d(X_scaled, y_clusters_db, 10, 10, "recence", "frequence", "montant", -150, 50)
db = DBSCAN(eps=0.05, min_samples=38).fit(X_scaled)
labels = db.labels_
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)  # DBSCAN marks noise points with label -1
print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)
Estimated number of clusters: 4 Estimated number of noise points: 9798
# Numpy array of all the cluster labels assigned to each data point
db_default = DBSCAN(eps = 0.05, min_samples = 38).fit(X_scaled)
labels = db_default.labels_
# Step 5: Building the clustering model
from sklearn import metrics  # already imported at the top of the notebook
# NOTE(review): the row index is used here as "ground truth", so every point
# is its own class — the homogeneity/ARI/AMI scores computed against this
# pseudo-truth are not meaningful; confirm whether real labels were intended.
labels_true = X_scaled.index
# Numpy array of all the cluster labels assigned to each data point
db_default = DBSCAN(eps = 0.05, min_samples = 38).fit(X_scaled)
core_samples_mask = np.zeros_like(db_default.labels_, dtype=bool)
core_samples_mask[db_default.core_sample_indices_] = True
labels = db_default.labels_
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)
print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)
print("Homogeneity: %0.4f" % metrics.homogeneity_score(labels_true, labels))
print("Completeness: %0.4f" % metrics.completeness_score(labels_true, labels))
print("V-measure: %0.4f" % metrics.v_measure_score(labels_true, labels))
print("Adjusted Rand Index: %0.4f"
      % metrics.adjusted_rand_score(labels_true, labels))
print("Adjusted Mutual Information: %0.4f"
      % metrics.adjusted_mutual_info_score(labels_true, labels))
print("Silhouette Coefficient: %0.4f"
      % metrics.silhouette_score(X_scaled, labels))
Estimated number of clusters: 4 Estimated number of noise points: 9798 Homogeneity: 0.0137 Completeness: 1.0000 V-measure: 0.0271 Adjusted Rand Index: 0.0000 Adjusted Mutual Information: 0.0000 Silhouette Coefficient: -0.4984
La segmentation avec DBscan n'est pas satisfaisante. En effet, pour obtenir un nombre de clusters exploitable on engendre un bruit important.
# RFM + satisfaction: adds the mean satisfaction score per customer.
# Note: frequency here uses 'size' (number of order lines), not 'nunique'
# (distinct orders) as in the first rfm table.
rfm2= df_synt.groupby('customer_unique_id').agg({'recence': 'min', 'order_id': 'size',
                                                 'payment_value' : 'sum','Satisfaction_num' : 'mean'})
rfm2.rename(columns={'order_id':'frequence','payment_value' : 'montant'}, inplace=True)
# Outlier handling: drop rows outside the 5th-95th percentile range for
# 'recence' (column 0) and 'Satisfaction_num' (column 3)
for i in [0, 3]:
    outlier_indices = []
    col = rfm2.columns[i]
    percentile_5 = np.percentile(rfm2[col], 5)
    percentile_95 = np.percentile(rfm2[col], 95)
    outlier_indices.append(rfm2[(rfm2[col] < percentile_5) | (rfm2[col] > percentile_95)].index)
    rfm2.drop(outlier_indices[0][:], inplace= True)
rfm2.reset_index(inplace= True, drop= True)
# Log transform of the amount
for i in ['montant']:
    rfm2[i] = np.log(rfm2[i]+1)
X2=rfm2
# Preprocessing + K-means pipeline
# NOTE(review): fit(), a discarded predict(), then fit_predict() — the model
# is fitted twice; identical result with the fixed random_state.
model= make_pipeline(RobustScaler(), KMeans(n_clusters=4,n_init=10,random_state=42))
model.fit(X2)
model.predict(X2)
y_clusters = model.fit_predict(X2)
model.score(X2)
-103739.9014567529
# Visualisation with a PCA: project the standardized features onto the
# first three principal components
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X2)
pca = PCA(n_components=3)
pca.fit(X_scaled)
X_proj = pca.transform(X_scaled)
X_proj = pd.DataFrame(X_proj, columns = ["PC1", "PC2", "PC3"])
X_proj
| PC1 | PC2 | PC3 | |
|---|---|---|---|
| 0 | -0.212099 | -0.950511 | 0.194460 |
| 1 | -1.434241 | -0.794261 | -0.090141 |
| 2 | 0.034936 | 2.526357 | -1.257903 |
| 3 | -1.091011 | 0.466339 | 0.428543 |
| 4 | 0.030871 | 0.133074 | 0.625947 |
| ... | ... | ... | ... |
| 92876 | 3.123728 | 0.833130 | 1.714591 |
| 92877 | -0.599946 | 0.040821 | 0.420893 |
| 92878 | -0.391347 | 1.937912 | 1.116516 |
| 92879 | -0.256745 | -0.895280 | 0.200720 |
| 92880 | -0.726904 | 1.448434 | 0.859312 |
92881 rows × 3 columns
# 3-D view of the clusters on the first three principal components
nuage_points_3d(X_proj, y_clusters, 10, 10, "PC1", "PC2", "PC3", 0, 90)
# 2-D view: recency vs amount, coloured by cluster
nuage_points_2d(X2.recence, X2.montant, y_clusters)
C:\Users\BRANCHET\AppData\Local\Temp\ipykernel_23556\1575981532.py:5: UserWarning: The palette list has more values (7) than needed (4), which may not be intended.
# Distortion elbow curve to choose K on the rfm2 features
distorsion_visualizer = make_pipeline(RobustScaler(), KElbowVisualizer(KMeans(n_init=10,random_state=42), k=(1,12)))
distorsion_visualizer.fit(X2) # Fit the data to the visualizer
distorsion_visualizer.named_steps['kelbowvisualizer'].show()
<Axes: title={'center': 'Distortion Score Elbow for KMeans Clustering'}, xlabel='k', ylabel='distortion score'>
# Calinski-Harabasz elbow curve as a second opinion
calinski_visualizer = make_pipeline(RobustScaler(), KElbowVisualizer(KMeans(n_init=10,random_state=42),
                                                                     k=(2,12),
                                                                     metric='calinski_harabasz',
                                                                     timings=False ))
calinski_visualizer.fit(X2) # Fit the data to the visualizer
calinski_visualizer.named_steps['kelbowvisualizer'].show()
<Axes: title={'center': 'Calinski Harabasz Score Elbow for KMeans Clustering'}, xlabel='k', ylabel='calinski harabasz score'>
La courbe du silhouette score met en évidence un « coude » à 4 ou 5 clusters. Poussons notre analyse avec des courbes supplémentaires.
# Instantiate the clustering model and visualizer
K = distorsion_visualizer.named_steps['kelbowvisualizer'].elbow_value_
# Silhouette plot with the elbow-selected K
silhouette_visualizer = make_pipeline(RobustScaler(), SilhouetteVisualizer(KMeans(n_clusters=K,
                                                                                  n_init=10,
                                                                                  random_state=42)))
silhouette_visualizer.fit(X2) # Fit the data to the visualizer
silhouette_visualizer.named_steps['silhouettevisualizer'].show() # Draw/show/poof the data
<Axes: title={'center': 'Silhouette Plot of KMeans Clustering for 92881 Samples in 4 Centers'}, xlabel='silhouette coefficient values', ylabel='cluster label'>
# Instantiate the clustering model and visualizer
K = distorsion_visualizer.named_steps['kelbowvisualizer'].elbow_value_
# Same silhouette plot forced to 5 clusters, for comparison
silhouette_visualizer = make_pipeline(RobustScaler(), SilhouetteVisualizer(KMeans(n_clusters=5,
                                                                                  n_init=10,
                                                                                  random_state=42)))
silhouette_visualizer.fit(X2) # Fit the data to the visualizer
silhouette_visualizer.named_steps['silhouettevisualizer'].show() # Draw/show/poof the data
<Axes: title={'center': 'Silhouette Plot of KMeans Clustering for 92881 Samples in 5 Centers'}, xlabel='silhouette coefficient values', ylabel='cluster label'>
Les clusters sont mieux répartis dans la configuration à quatre groupes.
# Intercluster distance Map with best k
# NOTE(review): InterclusterDistance has no 'k' range parameter; the
# k=(1,12) kwarg is presumably ignored — verify against the yellowbrick API.
distance_visualizer = make_pipeline(RobustScaler(), InterclusterDistance(KMeans(n_clusters=4, n_init=10,random_state=42), k=(1,12)))
distance_visualizer.fit(X2)
distance_visualizer.named_steps['interclusterdistance'].show()
C:\Users\BRANCHET\anaconda3\lib\site-packages\sklearn\manifold\_mds.py:601: UserWarning: The MDS API has changed. ``fit`` now constructs an dissimilarity matrix from data. To use a custom dissimilarity matrix, set ``dissimilarity='precomputed'``. C:\Users\BRANCHET\anaconda3\lib\site-packages\sklearn\manifold\_mds.py:299: FutureWarning: The default value of `normalized_stress` will change to `'auto'` in version 1.4. To suppress this warning, manually set the value of `normalized_stress`.
<Axes: title={'center': 'KMeans Intercluster Distance Map (via MDS)'}, xlabel='PC2', ylabel='PC1'>
# Same map with 5 clusters, for comparison
distance_visualizer = make_pipeline(RobustScaler(), InterclusterDistance(KMeans(n_clusters=5, n_init=10,random_state=42), k=(1,12)))
distance_visualizer.fit(X2)
distance_visualizer.named_steps['interclusterdistance'].show()
C:\Users\BRANCHET\anaconda3\lib\site-packages\sklearn\manifold\_mds.py:299: FutureWarning: The default value of `normalized_stress` will change to `'auto'` in version 1.4. To suppress this warning, manually set the value of `normalized_stress`.
<Axes: title={'center': 'KMeans Intercluster Distance Map (via MDS)'}, xlabel='PC2', ylabel='PC1'>
# KMeans Pipeline with best K
kmeans_model_first = make_pipeline(RobustScaler(), KMeans(n_clusters=K,n_init=10,random_state=42))
# NOTE(review): fit() + fit_predict() fits twice; same result with the
# fixed random_state, but one call would suffice.
kmeans_model_first.fit(X2)
y_clusters = kmeans_model_first.fit_predict(X2)
# Kmeans labels
kmeans_labels_first = kmeans_model_first.named_steps['kmeans'].labels_
# Scale X so per-cluster means are comparable across variables
scaler = RobustScaler()
X_scaled_first = scaler.fit_transform(X2)
X_scaled_first = pd.DataFrame(X_scaled_first, index=X2.index, columns=X2.columns)
X_scaled_first["kmeans_label"] = kmeans_labels_first
# Group by cluster
X_scaled_clusters_first = X_scaled_first.groupby("kmeans_label").mean()
# Plot Radar chart
plot_radars(X_scaled_clusters_first, "kmeans_label")
# Cluster size distribution
distri_clusters(X_scaled_first,'kmeans_label')
La segmentation est plus performante avec 4 Clusters
# Group by customers with sum or mean
data = df_synt.groupby("customer_unique_id")\
    .agg({'recence': 'min', 'order_id': "nunique", 'payment_value' : 'sum',
          'Satisfaction_num' : 'mean',
          "freight_value": "sum",
          "nb_payment_sequential": "mean",
          "sum_payment_installments": "mean",
          "delta_order_delivered_customer_date": "mean",
          # mode: the customer's most frequent sale month
          "sale_month": lambda x:x.value_counts().index[0],
          "distance_client_vendeur": "mean"})
# Rename columns
data = data.rename(columns={'order_id':'frequence','payment_value' : 'montant',
                            "freight_value": "total_freight",
                            "nb_payment_sequential": "mean_payment_sequential",
                            "sum_payment_installments": "mean_payment_installments",
                            "delta_order_delivered_customer_date": "mean_delivery_days",
                            "sale_month": "favorite_sale_month"})
data.head()
| recence | frequence | montant | Satisfaction_num | total_freight | mean_payment_sequential | mean_payment_installments | mean_delivery_days | favorite_sale_month | distance_client_vendeur | |
|---|---|---|---|---|---|---|---|---|---|---|
| customer_unique_id | ||||||||||
| 0000366f3b9a7992bf8c76cfdf3221e2 | 111 | 1 | 141.90 | 1.0 | 12.00 | 1.0 | 8.0 | 6.0 | 5 | 110.528468 |
| 0000b849f77a49e4a4ce2b2a4ca5be3f | 114 | 1 | 27.19 | 1.0 | 8.29 | 1.0 | 1.0 | 3.0 | 5 | 22.233428 |
| 0000f46a3911fa3c0805444483337064 | 537 | 1 | 86.22 | -1.0 | 17.22 | 1.0 | 8.0 | 26.0 | 3 | 517.016952 |
| 0000f6ccb0745a6a4b88665a16c9f078 | 321 | 1 | 43.62 | 1.0 | 17.63 | 1.0 | 4.0 | 20.0 | 10 | 2481.241232 |
| 0004aac84e0df4da2b147fca70cf8255 | 288 | 1 | 196.89 | 1.0 | 16.89 | 1.0 | 6.0 | 13.0 | 11 | 154.518198 |
# Outlier handling: drop rows outside the 5th-95th percentile range for
# 'recence' (column 0) and 'favorite_sale_month' (column 8)
for i in [0, 8]:
    outlier_indices = []
    col = data.columns[i]
    percentile_5 = np.percentile(data[col], 5)
    percentile_95 = np.percentile(data[col], 95)
    outlier_indices.append(data[(data[col] < percentile_5) | (data[col] > percentile_95)].index)
    data.drop(outlier_indices[0][:], inplace= True)
data_kmean_fin=data
# NOTE(review): saved before the dropna below, and with index=False the
# customer_unique_id index is lost in the CSV — confirm this is intended.
data_kmean_fin.to_csv ('data_kmean_fin.csv', index = False)
data.reset_index(inplace= True, drop= True)
# Missing values: keep only rows with a known customer-seller distance
data=data.dropna(subset=['distance_client_vendeur'], how='all')
# Distribution of the variables BEFORE the log transform
# (the original comment said "after"; these are the raw variables)
fig, ax = plt.subplots(nrows = 2, ncols = 3, figsize = (15, 10))
sns.histplot(data['total_freight'], kde = True, ax = ax[0,0])
sns.histplot(data['montant'], kde = True, ax = ax[0,1])
sns.histplot(data['mean_payment_installments'], kde = True, ax = ax[0,2])
sns.histplot(data['distance_client_vendeur'], kde = True, ax = ax[1,0])
sns.histplot(data['mean_delivery_days'], kde = True, ax = ax[1,1])
sns.histplot(data['recence'], kde = True, ax = ax[1,2])
plt.suptitle('Distribution of features', fontsize = 15)
plt.tight_layout(pad = 1)
plt.show()
# Distribution of the variables after a log(x+1) transform
fig, ax = plt.subplots(nrows = 2, ncols = 3, figsize = (15, 10))
sns.histplot(np.log(data['total_freight']+1), kde = True, ax = ax[0,0])
sns.histplot(np.log(data['montant']+1), kde = True, ax = ax[0,1])
sns.histplot(np.log(data['mean_payment_installments']+1), kde = True, ax = ax[0,2])
sns.histplot(np.log(data['distance_client_vendeur']+1), kde = True, ax = ax[1,0])
sns.histplot(np.log(data['mean_delivery_days']+1), kde = True, ax = ax[1,1])
sns.histplot(np.log(data['recence']+1), kde = True, ax = ax[1,2])
plt.suptitle('Distribution of features', fontsize = 15)
plt.tight_layout(pad = 1)
plt.show()
# Log transform of the most skewed variables
for i in ['mean_delivery_days', 'montant','distance_client_vendeur','total_freight']:
    data[i] = np.log(data[i]+1)
C:\Users\BRANCHET\AppData\Local\Temp\ipykernel_23556\3279108772.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X3=data
# Preprocessing + K-means pipeline on the full feature set
# NOTE(review): fit(), a discarded predict(), then fit_predict() — the model
# is fitted twice; identical result with the fixed random_state.
model= make_pipeline(RobustScaler(), KMeans(n_clusters=4,n_init=10,random_state=42))
model.fit(X3)
model.predict(X3)
y_clusters = model.fit_predict(X3)
model.score(X3)
-317592.57865533733
# Distortion elbow curve to choose K
distorsion_visualizer = make_pipeline(RobustScaler(),
                                      KElbowVisualizer(KMeans( n_init=10,random_state=42),
                                                       k=(1,12)))
distorsion_visualizer.fit(X3) # Fit the data to the visualizer
distorsion_visualizer.named_steps['kelbowvisualizer'].show()
<Axes: title={'center': 'Distortion Score Elbow for KMeans Clustering'}, xlabel='k', ylabel='distortion score'>
# Calinski-Harabasz elbow curve as a second opinion
calinski_visualizer = make_pipeline(RobustScaler(), KElbowVisualizer(KMeans(n_init=10,random_state=42),
                                                                     k=(2,12),
                                                                     metric='calinski_harabasz',
                                                                     timings=False ))
calinski_visualizer.fit(X3) # Fit the data to the visualizer
calinski_visualizer.named_steps['kelbowvisualizer'].show()
<Axes: title={'center': 'Calinski Harabasz Score Elbow for KMeans Clustering'}, xlabel='k', ylabel='calinski harabasz score'>
La courbe du distortion score est plus significative ; nous choisirons donc une segmentation avec 4 clusters.
# Instantiate the clustering model and visualizer
K = distorsion_visualizer.named_steps['kelbowvisualizer'].elbow_value_
# Silhouette plot with the elbow-selected K
silhouette_visualizer = make_pipeline(RobustScaler(),
                                      SilhouetteVisualizer(KMeans(n_clusters=K,
                                                                  n_init=10,
                                                                  random_state=42)))
silhouette_visualizer.fit(X3) # Fit the data to the visualizer
silhouette_visualizer.named_steps['silhouettevisualizer'].show() # Draw/show/poof the data
<Axes: title={'center': 'Silhouette Plot of KMeans Clustering for 92628 Samples in 4 Centers'}, xlabel='silhouette coefficient values', ylabel='cluster label'>
# Intercluster distance Map with best k
distance_visualizer = make_pipeline(RobustScaler(),
                                    InterclusterDistance(KMeans(n_clusters=K,
                                                                n_init=10,
                                                                random_state=42),
                                                         k=(2,12)))
distance_visualizer.fit(X3)
distance_visualizer.named_steps['interclusterdistance'].show()
C:\Users\BRANCHET\anaconda3\lib\site-packages\sklearn\manifold\_mds.py:299: FutureWarning: The default value of `normalized_stress` will change to `'auto'` in version 1.4. To suppress this warning, manually set the value of `normalized_stress`.
<Axes: title={'center': 'KMeans Intercluster Distance Map (via MDS)'}, xlabel='PC2', ylabel='PC1'>
On remarque que les différents clusters sont bien distincts sur les 2 premières composantes. Le clustering semble donc performant et il faut à présent donner une interprétation métier de chaque cluster.
Nous allons maintenant entrainer notre KMeans avec le K optimal sélectionné et affecter son cluster à chaque client. Ainsi, nous pourrons analyser les différences entre chaque cluster :
# KMeans Pipeline with best K
kmeans_model_bis = make_pipeline(RobustScaler(), KMeans(n_clusters=K,n_init=10,random_state=42))
# NOTE(review): fit() + fit_predict() fits twice; same result with the
# fixed random_state.
kmeans_model_bis.fit(X3)
y_clusters = kmeans_model_bis.fit_predict(X3)
# Kmeans labels
kmeans_labels_bis = kmeans_model_bis.named_steps['kmeans'].labels_
# Scale X so per-cluster means are comparable across variables
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X3)
X_scaled = pd.DataFrame(X_scaled, index=X3.index, columns=X3.columns)
X_scaled["kmeans_label"] = kmeans_labels_bis
# Group by cluster
X_scaled_clusters_bis = X_scaled.groupby("kmeans_label").mean()
X_scaled.head()
| recence | frequence | montant | Satisfaction_num | total_freight | mean_payment_sequential | mean_payment_installments | mean_delivery_days | favorite_sale_month | distance_client_vendeur | kmeans_label | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.463203 | 0.0 | 0.196753 | 0.0 | -0.601047 | 0.0 | 2.000000 | -0.599631 | -0.2 | -0.952128 | 3 |
| 1 | -0.450216 | 0.0 | -1.220276 | 0.0 | -1.158167 | 0.0 | -0.333333 | -1.342052 | -0.2 | -2.047630 | 3 |
| 2 | 1.380952 | 0.0 | -0.234255 | -2.0 | -0.041341 | 0.0 | 2.000000 | 1.191265 | -0.6 | 0.120356 | 1 |
| 3 | 0.445887 | 0.0 | -0.819383 | 0.0 | -0.004444 | 0.0 | 0.666667 | 0.857855 | 0.8 | 1.214616 | 0 |
| 4 | 0.303030 | 0.0 | 0.480971 | 0.0 | -0.071647 | 0.0 | 1.333333 | 0.319940 | 1.0 | -0.719937 | 0 |
X_scaled_clusters_bis
| recence | frequence | montant | Satisfaction_num | total_freight | mean_payment_sequential | mean_payment_installments | mean_delivery_days | favorite_sale_month | distance_client_vendeur | |
|---|---|---|---|---|---|---|---|---|---|---|
| kmeans_label | ||||||||||
| 0 | 0.117743 | 0.016190 | -0.149536 | -0.004480 | 0.095280 | 0.023939 | 0.007371 | 0.122588 | 0.009108 | 0.157172 |
| 1 | 0.119106 | 0.018588 | 0.030965 | -1.979400 | 0.256501 | 0.033286 | 0.167609 | 0.602612 | -0.053199 | 0.109398 |
| 2 | 0.079659 | 0.105835 | 1.025929 | -0.321876 | 1.338055 | 0.127502 | 1.489453 | 0.200600 | 0.031427 | 0.196665 |
| 3 | -0.047127 | 0.016150 | -0.313504 | -0.250297 | -0.808790 | 0.021450 | 0.066091 | -0.866682 | 0.014041 | -1.708682 |
# Plot Radar chart
plot_radars(X_scaled_clusters_bis, "kmeans_label")
# Attach the cluster label to the (unscaled) customer table
data_output = data.copy()
data_output['Cluster'] = y_clusters
data_output.sample(5)
| recence | frequence | montant | Satisfaction_num | total_freight | mean_payment_sequential | mean_payment_installments | mean_delivery_days | favorite_sale_month | distance_client_vendeur | Cluster | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 40302 | 198 | 1 | 5.196285 | -1.0 | 3.072693 | 1.0 | 2.0 | 2.944439 | 2 | 6.685253 | 1 |
| 25746 | 204 | 1 | 4.657383 | 1.0 | 2.795450 | 1.0 | 5.0 | 2.890372 | 2 | 6.155758 | 0 |
| 22024 | 279 | 1 | 4.264650 | 1.0 | 2.781301 | 1.0 | 1.0 | 2.944439 | 11 | 6.744722 | 0 |
| 86043 | 126 | 1 | 4.433551 | 1.0 | 2.961658 | 1.0 | 7.0 | 2.197225 | 4 | 6.395095 | 0 |
| 19545 | 184 | 1 | 5.113793 | -1.0 | 3.835142 | 1.0 | 1.0 | 3.367296 | 2 | 5.875831 | 1 |
# NOTE(review): this redefines distri_clusters identically to the earlier
# cell — the notebook could reuse the first definition instead.
def distri_clusters(df, variable):
    """Bar plot of cluster sizes, annotating each bar with its count.

    Generalized from the original hard-coded 4-cluster version: works for
    any number of integer cluster labels present in df[variable].

    Parameters
    ----------
    df : pandas.DataFrame
        One row per observation (customer).
    variable : str
        Name of the column holding each row's cluster label.
    """
    sns.countplot(data=df, x=variable, palette='viridis_r')
    plt.title('Cluster Sizes', fontsize=15)
    plt.xlabel('Clusters', fontsize=14)
    plt.ylabel('No. of Customers', fontsize=14)
    counts = df[variable].value_counts()
    labels = sorted(counts.index)
    plt.xticks(range(len(labels)), ['Cluster ' + str(lab) for lab in labels], rotation=0)
    # Annotate each bar slightly above its top with the cluster size
    for pos, lab in enumerate(labels):
        plt.text(x=pos - 0.2, y=counts[lab] + 500, s=counts[lab])
    plt.tight_layout(pad=-1)
    plt.show()
distri_clusters(X_scaled,'kmeans_label')
Nous allons à présent réaliser une réduction dimensionnelle pour vérifier si le clustering est réalisable sur un nombre réduit de variables, sans perturber les groupes.
# PCA pipeline: robust scaling followed by a full-SVD PCA.
pca = make_pipeline(RobustScaler(),
                    PCA(svd_solver='full'))
pca.fit(X3)
X_projected = pca.transform(X3)

# Percentage of variance explained by each principal axis.
varexpl = pca.named_steps['pca'].explained_variance_ratio_ * 100

# Target share of total variance to retain. Defined BEFORE the plot so
# the threshold line below cannot drift from the value used in the
# computation (the original hard-coded y=95 then defined limit=95 later).
limit = 95

# Scree plot with cumulated variance overlay.
plt.figure(figsize=(12, 8))
plt.bar(np.arange(len(varexpl)) + 1, varexpl)
cumSumVar = varexpl.cumsum()
plt.plot(np.arange(len(varexpl)) + 1, cumSumVar, c="red", marker='o')
plt.axhline(y=limit, linestyle="--",
            color="green",
            linewidth=1)
# cumSumVar is non-decreasing, so the first index where it reaches the
# threshold is the minimal number of axes to keep (equivalent to the
# original argmin-over-filtered-indices dance, but O(n) and readable).
min_plans = int(np.argmax(cumSumVar >= limit)) + 1
plt.axvline(x=min_plans, linestyle="--",
            color="green",
            linewidth=1)
plt.xlabel("rang de l'axe d'inertie")
plt.xticks(np.arange(len(varexpl)) + 1)
plt.ylabel("pourcentage d'inertie")
plt.title("{}% de la variance totale est expliquée"
          " par les {} premiers axes".format(limit,
                                             min_plans))
plt.show(block=False)
def cerle_corr(pcs, n_comp, pca, axis_ranks,
               labels=None, label_rotation=0):
    """Plot correlation circles for the given pairs of principal axes.

    Parameters
    ----------
    pcs : ndarray of shape (n_components, n_features)
        PCA loadings (``pca.components_``).
    n_comp : int
        Number of components announced in the figure title; pairs whose
        second axis index is >= n_comp are skipped.
    pca : fitted sklearn PCA
        Used only for ``explained_variance_ratio_`` in the axis labels.
    axis_ranks : iterable of (int, int)
        0-based axis index pairs to plot, e.g. ``[(0, 1), (2, 3)]``.
    labels : array-like of str, optional
        Variable names for the arrow tips. Fixed: the original crashed
        with the default ``labels=None``; arrows are now simply left
        unlabelled in that case.
    label_rotation : float, default 0
        Rotation (degrees) of the variable labels. Fixed: the parameter
        existed but was never used; it is now forwarded to ``annotate``
        (default 0 matches matplotlib's default, so existing callers
        are unaffected).
    """
    fig = plt.figure(figsize=(20, n_comp * 5))
    count = 1
    for d1, d2 in axis_ranks:
        if d2 < n_comp:
            ax = plt.subplot(int(n_comp / 2), 2, count)
            ax.set_aspect('equal', adjustable='box')
            # Loadings live inside the unit circle, so fix the bounds.
            ax.set_xlim(-1, 1)
            ax.set_ylim(-1, 1)
            # One arrow per original variable.
            ax.quiver(np.zeros(pcs.shape[1]), np.zeros(pcs.shape[1]),
                      pcs[d1, :], pcs[d2, :],
                      angles='xy', scale_units='xy', scale=1,
                      color="grey", alpha=0.5)
            # Variable names at the arrow tips.
            if labels is not None:
                for i, (x, y) in enumerate(pcs[[d1, d2]].T):
                    ax.annotate(labels[i], (x, y),
                                ha='center', va='center',
                                rotation=label_rotation,
                                fontsize='14', color="#17aafa", alpha=0.8)
            # Dashed reference axes through the origin.
            ax.plot([-1, 1], [0, 0], linewidth=1, color='grey', ls='--')
            ax.plot([0, 0], [-1, 1], linewidth=1, color='grey', ls='--')
            # Unit circle outline.
            cercle = plt.Circle((0, 0), 1, color='#17aafa', fill=False)
            ax.add_artist(cercle)
            # Axis labels with each axis' share of explained inertia.
            ax.set_xlabel('F{} ({}%)'.format(d1 + 1,
                          round(100 * pca.explained_variance_ratio_[d1], 1)))
            ax.set_ylabel('F{} ({}%)'.format(d2 + 1,
                          round(100 * pca.explained_variance_ratio_[d2], 1)))
            ax.set_title("Cercle des corrélations (F{} et F{})".format(d1 + 1, d2 + 1))
            count += 1
    plt.suptitle("Cercles des corrélations sur les {} premiers axes".format(n_comp),
                 y=.9, color="blue", fontsize=18)
    plt.show(block=False)
# Principal component space
# Loadings matrix of the fitted PCA step: shape (n_components, n_features).
pcs = pca.named_steps['pca'].components_
# Plot correlation circle
# Draw correlation circles for the axis pairs (F1,F2), (F3,F4), (F5,F6), (F6,F7).
# NOTE(review): the pair (5,6) re-uses axis F6 already shown in (4,5) —
# confirm (6,7) was not intended.
cerle_corr(pcs,
8,
pca.named_steps['pca'],
[(0,1),(2,3),(4,5),(5,6)],
labels = np.array(X3.columns))
On peut voir les variables qui contribuent le plus à chaque axe. Par exemple:
Nous allons donc intégrer à notre pipeline Kmeans une PCA sur 5 composantes pour vérifier si la réduction dimensionnelle réduit la qualité de la segmentation
# K-Means pipeline with the previously selected K, applied to the
# PCA-projected data.
kmeans_model_pca = Pipeline([("preprocessor", RobustScaler()),
                             ("kmeans", KMeans(n_clusters=K, n_init=10,
                                               random_state=42))])
# BUG FIX: the original fitted the pipeline on X_projected[:,:7] and then
# immediately re-fitted it via fit_predict on X_projected[:,:4], silently
# discarding the first fit. Fit and predict once, on a consistent slice.
# 7 components is kept to match the clustering_eval call below;
# NOTE(review): the surrounding text mentions 5 components — confirm.
y_clusters = kmeans_model_pca.fit_predict(X_projected[:, :7])

# Cluster labels from the fitted K-Means step.
pca_kmeans_labels = kmeans_model_pca.named_steps['kmeans'].labels_
X_scaled["kmeans_label_pca"] = pca_kmeans_labels
# Per-cluster mean profile (last column excluded from display).
X_scaled_clusters_pca = X_scaled.groupby("kmeans_label_pca").mean()
X_scaled_clusters_pca.iloc[:, :-1]
| recence | frequence | montant | Satisfaction_num | total_freight | mean_payment_sequential | mean_payment_installments | mean_delivery_days | favorite_sale_month | distance_client_vendeur | |
|---|---|---|---|---|---|---|---|---|---|---|
| kmeans_label_pca | ||||||||||
| 0 | 0.070753 | 0.041072 | 0.012025 | -0.009700 | 0.362944 | 0.046642 | -0.048946 | 0.147130 | 0.007197 | 0.206412 |
| 1 | 0.090581 | 0.026192 | 0.170944 | -1.980500 | 0.335170 | 0.042970 | 0.333482 | 0.426904 | -0.040165 | -0.109287 |
| 2 | 0.256897 | 0.032089 | 0.512640 | -0.053619 | 0.405145 | 0.058876 | 1.894254 | 0.100287 | 0.052494 | 0.041006 |
| 3 | -0.066761 | 0.021603 | -0.290779 | -0.020862 | -0.719762 | 0.026880 | -0.029056 | -0.882283 | 0.008478 | -1.670233 |
# Evaluate the K-Means clustering on the first 7 principal components:
# distortion metric, no elbow search (K is fixed to the best K found
# earlier), with an MDS visualisation of the clusters.
clustering_eval(preprocessor=RobustScaler(),
model=KMeans(n_init=10,random_state=42),
data=X_projected[:,:7],
metric="distortion",
elbow=False,
mds=True,
KBest=K)
C:\Users\BRANCHET\anaconda3\lib\site-packages\sklearn\manifold\_mds.py:299: FutureWarning: The default value of `normalized_stress` will change to `'auto'` in version 1.4. To suppress this warning, manually set the value of `normalized_stress`.
4
# Radar chart of the per-cluster mean profiles for the PCA-based labels
# (last column excluded).
plot_radars(df=X_scaled_clusters_pca.iloc[:,:-1],
group="kmeans_label_pca")
# Bar plot of cluster sizes for the PCA-based K-means labels.
distri_clusters(X_scaled,'kmeans_label_pca')
Nous avons segmenté notre dataset clients en utilisant les algorithmes K-Means et DBSCAN. Les résultats tournent autour de 4 à 5 clusters, mais le plus probant est le modèle K-Means, qui donne des clusters bien définis et distincts. Par ailleurs, la réduction de dimension ne perturbe pas la segmentation et permet de réduire le nombre de variables, mais n'améliore pas le silhouette-score. Quelques pistes d'amélioration: